{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Covertype Data Set Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Set the paths"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "FULL_DATASET = '../covertype.csv'\n",
    "SMALL_DATASET= '../covertype_small.csv'\n",
    "TRAINING_DATASET='../covertype_training.csv'\n",
    "TRAINING_DATASET_WITH_MISSING = '../covertype_training_missing.csv'\n",
    "EVALUATION_DATASET='../covertype_evaluation.csv'\n",
    "EVALUATION_DATASET_WITH_ANOMALIES='../covertype_evaluation_anomalies.csv'\n",
    "SERVING_DATASET='../covertype_serving.csv'\n",
    "\n",
    "ORIGINAL_DATASET_PATH = 'gs://workshop-datasets/covertype/orig/covtype.data'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preprocess the original dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(581012, 55)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>45</th>\n",
       "      <th>46</th>\n",
       "      <th>47</th>\n",
       "      <th>48</th>\n",
       "      <th>49</th>\n",
       "      <th>50</th>\n",
       "      <th>51</th>\n",
       "      <th>52</th>\n",
       "      <th>53</th>\n",
       "      <th>54</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2596</td>\n",
       "      <td>51</td>\n",
       "      <td>3</td>\n",
       "      <td>258</td>\n",
       "      <td>0</td>\n",
       "      <td>510</td>\n",
       "      <td>221</td>\n",
       "      <td>232</td>\n",
       "      <td>148</td>\n",
       "      <td>6279</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2590</td>\n",
       "      <td>56</td>\n",
       "      <td>2</td>\n",
       "      <td>212</td>\n",
       "      <td>-6</td>\n",
       "      <td>390</td>\n",
       "      <td>220</td>\n",
       "      <td>235</td>\n",
       "      <td>151</td>\n",
       "      <td>6225</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2804</td>\n",
       "      <td>139</td>\n",
       "      <td>9</td>\n",
       "      <td>268</td>\n",
       "      <td>65</td>\n",
       "      <td>3180</td>\n",
       "      <td>234</td>\n",
       "      <td>238</td>\n",
       "      <td>135</td>\n",
       "      <td>6121</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2785</td>\n",
       "      <td>155</td>\n",
       "      <td>18</td>\n",
       "      <td>242</td>\n",
       "      <td>118</td>\n",
       "      <td>3090</td>\n",
       "      <td>238</td>\n",
       "      <td>238</td>\n",
       "      <td>122</td>\n",
       "      <td>6211</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2595</td>\n",
       "      <td>45</td>\n",
       "      <td>2</td>\n",
       "      <td>153</td>\n",
       "      <td>-1</td>\n",
       "      <td>391</td>\n",
       "      <td>220</td>\n",
       "      <td>234</td>\n",
       "      <td>150</td>\n",
       "      <td>6172</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 55 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     0    1   2    3    4     5    6    7    8     9   ...  45  46  47  48  \\\n",
       "0  2596   51   3  258    0   510  221  232  148  6279  ...   0   0   0   0   \n",
       "1  2590   56   2  212   -6   390  220  235  151  6225  ...   0   0   0   0   \n",
       "2  2804  139   9  268   65  3180  234  238  135  6121  ...   0   0   0   0   \n",
       "3  2785  155  18  242  118  3090  238  238  122  6211  ...   0   0   0   0   \n",
       "4  2595   45   2  153   -1   391  220  234  150  6172  ...   0   0   0   0   \n",
       "\n",
       "   49  50  51  52  53  54  \n",
       "0   0   0   0   0   0   5  \n",
       "1   0   0   0   0   0   5  \n",
       "2   0   0   0   0   0   2  \n",
       "3   0   0   0   0   0   2  \n",
       "4   0   0   0   0   0   5  \n",
       "\n",
       "[5 rows x 55 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.read_csv(ORIGINAL_DATASET_PATH, header=None)\n",
    "print(df.shape)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Configure soil type and wilderness area domains"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "soil_type = [\n",
    "\"1\", \"C2702\", \"Cathedral family - Rock outcrop complex, extremely stony.\",\n",
    "\"2\", \"C2703\", \"Vanet - Ratake families complex, very stony.\",\n",
    "\"3\", \"C2704\", \"Haploborolis - Rock outcrop complex, rubbly.\",\n",
    "\"4\", \"C2705\", \"Ratake family - Rock outcrop complex, rubbly.\",\n",
    "\"5\", \"C2706\", \"Vanet family - Rock outcrop complex complex, rubbly.\",\n",
    "\"6\", \"C2717\", \"Vanet - Wetmore families - Rock outcrop complex, stony.\",\n",
    "\"7\", \"C3501\", \"Gothic family.\",\n",
    "\"8\", \"C3502\", \"Supervisor - Limber families complex.\",\n",
    "\"9\", \"C4201\", \"Troutville family, very stony.\",\n",
    "\"10\", \"C4703\", \"Bullwark - Catamount families - Rock outcrop complex, rubbly.\",\n",
    "\"11\", \"C4704\", \"Bullwark - Catamount families - Rock land complex, rubbly.\",\n",
    "\"12\", \"C4744\", \"Legault family - Rock land complex, stony.\",\n",
    "\"13\", \"C4758\", \"Catamount family - Rock land - Bullwark family complex, rubbly.\",\n",
    "\"14\", \"C5101\", \"Pachic Argiborolis - Aquolis complex.\",\n",
    "\"15\", \"C5151\", \"unspecified in the USFS Soil and ELU Survey.\",\n",
    "\"16\", \"C6101\", \"Cryaquolis - Cryoborolis complex.\",\n",
    "\"17\", \"C6102\", \"Gateview family - Cryaquolis complex.\",\n",
    "\"18\", \"C6731\", \"Rogert family, very stony.\",\n",
    "\"19\", \"C7101\", \"Typic Cryaquolis - Borohemists complex.\",\n",
    "\"20\", \"C7102\", \"Typic Cryaquepts - Typic Cryaquolls complex.\",\n",
    "\"21\", \"C7103\", \"Typic Cryaquolls - Leighcan family, till substratum complex.\",\n",
    "\"22\", \"C7201\", \"Leighcan family, till substratum, extremely bouldery.\",\n",
    "\"23\", \"C7202\", \"Leighcan family, till substratum - Typic Cryaquolls complex.\",\n",
    "\"24\", \"C7700\", \"Leighcan family, extremely stony.\",\n",
    "\"25\", \"C7701\", \"Leighcan family, warm, extremely stony.\",\n",
    "\"26\", \"C7702\", \"Granile - Catamount families complex, very stony.\",\n",
    "\"27\", \"C7709\", \"Leighcan family, warm - Rock outcrop complex, extremely stony.\",\n",
    "\"28\", \"C7710\", \"Leighcan family - Rock outcrop complex, extremely stony.\",\n",
    "\"29\", \"C7745\", \"Como - Legault families complex, extremely stony.\",\n",
    "\"30\", \"C7746\", \"Como family - Rock land - Legault family complex, extremely stony.\",\n",
    "\"31\", \"C7755\", \"Leighcan - Catamount families complex, extremely stony.\",\n",
    "\"32\", \"C7756\", \"Catamount family - Rock outcrop - Leighcan family complex, extremely stony.\",\n",
    "\"33\", \"C7757\", \"Leighcan - Catamount families - Rock outcrop complex, extremely stony.\",\n",
    "\"34\", \"C7790\", \"Cryorthents - Rock land complex, extremely stony.\",\n",
    "\"35\", \"C8703\", \"Cryumbrepts - Rock outcrop - Cryaquepts complex.\",\n",
    "\"36\", \"C8707\", \"Bross family - Rock land - Cryumbrepts complex, extremely stony.\",\n",
    "\"37\", \"C8708\", \"Rock outcrop - Cryumbrepts - Cryorthents complex, extremely stony.\",\n",
    "\"38\", \"C8771\", \"Leighcan - Moran families - Cryaquolls complex, extremely stony.\",\n",
    "\"39\", \"C8772\", \"Moran family - Cryorthents - Leighcan family complex, extremely stony.\",\n",
    "\"40\", \"C8776\", \"Moran family - Cryorthents - Rock land complex, extremely stony.\",\n",
    "]\n",
    "\n",
    "wilderness_area = [\n",
    "\"Rawah\", \"Rawah Wilderness Area\",\n",
    "\"Neota\", \"Neota Wilderness Area\",\n",
    "\"Commanche\", \"Comanche Peak Wilderness Area\",\n",
    "\"Cache\", \"Cache la Poudre Wilderness Area\"\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Map one-hot encoded values to categorical domains"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0         C7745\n",
       "1         C7745\n",
       "2         C4744\n",
       "3         C7746\n",
       "4         C7745\n",
       "          ...  \n",
       "581007    C2703\n",
       "581008    C2703\n",
       "581009    C2703\n",
       "581010    C2703\n",
       "581011    C2703\n",
       "Length: 581012, dtype: object"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "soil = df.loc[:, 14:53].apply(lambda x: soil_type[1::3][x.to_numpy().nonzero()[0][0]], axis=1)\n",
    "soil"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0             Rawah\n",
       "1             Rawah\n",
       "2             Rawah\n",
       "3             Rawah\n",
       "4             Rawah\n",
       "            ...    \n",
       "581007    Commanche\n",
       "581008    Commanche\n",
       "581009    Commanche\n",
       "581010    Commanche\n",
       "581011    Commanche\n",
       "Length: 581012, dtype: object"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "wilderness = df.loc[:, 10:13].apply(lambda x: wilderness_area[0::2][x.to_numpy().nonzero()[0][0]], axis=1)\n",
    "wilderness"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create a dataset with column names and categorical values replacing one-hot encoded soil type and wilderness areas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Elevation</th>\n",
       "      <th>Aspect</th>\n",
       "      <th>Slope</th>\n",
       "      <th>Horizontal_Distance_To_Hydrology</th>\n",
       "      <th>Vertical_Distance_To_Hydrology</th>\n",
       "      <th>Horizontal_Distance_To_Roadways</th>\n",
       "      <th>Hillshade_9am</th>\n",
       "      <th>Hillshade_Noon</th>\n",
       "      <th>Hillshade_3pm</th>\n",
       "      <th>Horizontal_Distance_To_Fire_Points</th>\n",
       "      <th>Wilderness_Area</th>\n",
       "      <th>Soil_Type</th>\n",
       "      <th>Cover_Type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2596</td>\n",
       "      <td>51</td>\n",
       "      <td>3</td>\n",
       "      <td>258</td>\n",
       "      <td>0</td>\n",
       "      <td>510</td>\n",
       "      <td>221</td>\n",
       "      <td>232</td>\n",
       "      <td>148</td>\n",
       "      <td>6279</td>\n",
       "      <td>Rawah</td>\n",
       "      <td>C7745</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2590</td>\n",
       "      <td>56</td>\n",
       "      <td>2</td>\n",
       "      <td>212</td>\n",
       "      <td>-6</td>\n",
       "      <td>390</td>\n",
       "      <td>220</td>\n",
       "      <td>235</td>\n",
       "      <td>151</td>\n",
       "      <td>6225</td>\n",
       "      <td>Rawah</td>\n",
       "      <td>C7745</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2804</td>\n",
       "      <td>139</td>\n",
       "      <td>9</td>\n",
       "      <td>268</td>\n",
       "      <td>65</td>\n",
       "      <td>3180</td>\n",
       "      <td>234</td>\n",
       "      <td>238</td>\n",
       "      <td>135</td>\n",
       "      <td>6121</td>\n",
       "      <td>Rawah</td>\n",
       "      <td>C4744</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2785</td>\n",
       "      <td>155</td>\n",
       "      <td>18</td>\n",
       "      <td>242</td>\n",
       "      <td>118</td>\n",
       "      <td>3090</td>\n",
       "      <td>238</td>\n",
       "      <td>238</td>\n",
       "      <td>122</td>\n",
       "      <td>6211</td>\n",
       "      <td>Rawah</td>\n",
       "      <td>C7746</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2595</td>\n",
       "      <td>45</td>\n",
       "      <td>2</td>\n",
       "      <td>153</td>\n",
       "      <td>-1</td>\n",
       "      <td>391</td>\n",
       "      <td>220</td>\n",
       "      <td>234</td>\n",
       "      <td>150</td>\n",
       "      <td>6172</td>\n",
       "      <td>Rawah</td>\n",
       "      <td>C7745</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>581007</th>\n",
       "      <td>2396</td>\n",
       "      <td>153</td>\n",
       "      <td>20</td>\n",
       "      <td>85</td>\n",
       "      <td>17</td>\n",
       "      <td>108</td>\n",
       "      <td>240</td>\n",
       "      <td>237</td>\n",
       "      <td>118</td>\n",
       "      <td>837</td>\n",
       "      <td>Commanche</td>\n",
       "      <td>C2703</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>581008</th>\n",
       "      <td>2391</td>\n",
       "      <td>152</td>\n",
       "      <td>19</td>\n",
       "      <td>67</td>\n",
       "      <td>12</td>\n",
       "      <td>95</td>\n",
       "      <td>240</td>\n",
       "      <td>237</td>\n",
       "      <td>119</td>\n",
       "      <td>845</td>\n",
       "      <td>Commanche</td>\n",
       "      <td>C2703</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>581009</th>\n",
       "      <td>2386</td>\n",
       "      <td>159</td>\n",
       "      <td>17</td>\n",
       "      <td>60</td>\n",
       "      <td>7</td>\n",
       "      <td>90</td>\n",
       "      <td>236</td>\n",
       "      <td>241</td>\n",
       "      <td>130</td>\n",
       "      <td>854</td>\n",
       "      <td>Commanche</td>\n",
       "      <td>C2703</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>581010</th>\n",
       "      <td>2384</td>\n",
       "      <td>170</td>\n",
       "      <td>15</td>\n",
       "      <td>60</td>\n",
       "      <td>5</td>\n",
       "      <td>90</td>\n",
       "      <td>230</td>\n",
       "      <td>245</td>\n",
       "      <td>143</td>\n",
       "      <td>864</td>\n",
       "      <td>Commanche</td>\n",
       "      <td>C2703</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>581011</th>\n",
       "      <td>2383</td>\n",
       "      <td>165</td>\n",
       "      <td>13</td>\n",
       "      <td>60</td>\n",
       "      <td>4</td>\n",
       "      <td>67</td>\n",
       "      <td>231</td>\n",
       "      <td>244</td>\n",
       "      <td>141</td>\n",
       "      <td>875</td>\n",
       "      <td>Commanche</td>\n",
       "      <td>C2703</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>581012 rows × 13 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        Elevation  Aspect  Slope  Horizontal_Distance_To_Hydrology  \\\n",
       "0            2596      51      3                               258   \n",
       "1            2590      56      2                               212   \n",
       "2            2804     139      9                               268   \n",
       "3            2785     155     18                               242   \n",
       "4            2595      45      2                               153   \n",
       "...           ...     ...    ...                               ...   \n",
       "581007       2396     153     20                                85   \n",
       "581008       2391     152     19                                67   \n",
       "581009       2386     159     17                                60   \n",
       "581010       2384     170     15                                60   \n",
       "581011       2383     165     13                                60   \n",
       "\n",
       "        Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \\\n",
       "0                                    0                              510   \n",
       "1                                   -6                              390   \n",
       "2                                   65                             3180   \n",
       "3                                  118                             3090   \n",
       "4                                   -1                              391   \n",
       "...                                ...                              ...   \n",
       "581007                              17                              108   \n",
       "581008                              12                               95   \n",
       "581009                               7                               90   \n",
       "581010                               5                               90   \n",
       "581011                               4                               67   \n",
       "\n",
       "        Hillshade_9am  Hillshade_Noon  Hillshade_3pm  \\\n",
       "0                 221             232            148   \n",
       "1                 220             235            151   \n",
       "2                 234             238            135   \n",
       "3                 238             238            122   \n",
       "4                 220             234            150   \n",
       "...               ...             ...            ...   \n",
       "581007            240             237            118   \n",
       "581008            240             237            119   \n",
       "581009            236             241            130   \n",
       "581010            230             245            143   \n",
       "581011            231             244            141   \n",
       "\n",
       "        Horizontal_Distance_To_Fire_Points Wilderness_Area Soil_Type  \\\n",
       "0                                     6279           Rawah     C7745   \n",
       "1                                     6225           Rawah     C7745   \n",
       "2                                     6121           Rawah     C4744   \n",
       "3                                     6211           Rawah     C7746   \n",
       "4                                     6172           Rawah     C7745   \n",
       "...                                    ...             ...       ...   \n",
       "581007                                 837       Commanche     C2703   \n",
       "581008                                 845       Commanche     C2703   \n",
       "581009                                 854       Commanche     C2703   \n",
       "581010                                 864       Commanche     C2703   \n",
       "581011                                 875       Commanche     C2703   \n",
       "\n",
       "        Cover_Type  \n",
       "0                5  \n",
       "1                5  \n",
       "2                2  \n",
       "3                2  \n",
       "4                5  \n",
       "...            ...  \n",
       "581007           3  \n",
       "581008           3  \n",
       "581009           3  \n",
       "581010           3  \n",
       "581011           3  \n",
       "\n",
       "[581012 rows x 13 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "COLUMN_NAMES = [\n",
    "    'Elevation', \n",
    "    'Aspect', \n",
    "    'Slope', \n",
    "    'Horizontal_Distance_To_Hydrology',\n",
    "    'Vertical_Distance_To_Hydrology',\n",
    "    'Horizontal_Distance_To_Roadways',\n",
    "    'Hillshade_9am',\n",
    "    'Hillshade_Noon',\n",
    "    'Hillshade_3pm',\n",
    "    'Horizontal_Distance_To_Fire_Points',\n",
    "    'Wilderness_Area',\n",
    "    'Soil_Type',\n",
    "    'Cover_Type']\n",
    "\n",
    "df_full = pd.concat([df.loc[:, 0:9], wilderness, soil, df.loc[:, 54]], axis=1, ignore_index=True)\n",
    "df_full.columns = COLUMN_NAMES\n",
    "df_full"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Convert the label to 0-6 range"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_full['Cover_Type'] = df_full['Cover_Type'] - 1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Save the dataset to CSV file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_full.to_csv(FULL_DATASET, header=True, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type,Cover_Type\n",
      "2596,51,3,258,0,510,221,232,148,6279,Rawah,C7745,4\n",
      "2590,56,2,212,-6,390,220,235,151,6225,Rawah,C7745,4\n",
      "2804,139,9,268,65,3180,234,238,135,6121,Rawah,C4744,1\n",
      "2785,155,18,242,118,3090,238,238,122,6211,Rawah,C7746,1\n",
      "2595,45,2,153,-1,391,220,234,150,6172,Rawah,C7745,4\n",
      "2579,132,6,300,-15,67,230,237,140,6031,Rawah,C7745,1\n",
      "2606,45,7,270,5,633,222,225,138,6256,Rawah,C7745,4\n",
      "2605,49,4,234,7,573,222,230,144,6228,Rawah,C7745,4\n",
      "2617,45,9,240,56,666,223,221,133,6244,Rawah,C7745,4\n"
     ]
    }
   ],
   "source": [
    "!head $FULL_DATASET"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create training, validation, testing and serving splits."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Elevation</th>\n",
       "      <th>Aspect</th>\n",
       "      <th>Slope</th>\n",
       "      <th>Horizontal_Distance_To_Hydrology</th>\n",
       "      <th>Vertical_Distance_To_Hydrology</th>\n",
       "      <th>Horizontal_Distance_To_Roadways</th>\n",
       "      <th>Hillshade_9am</th>\n",
       "      <th>Hillshade_Noon</th>\n",
       "      <th>Hillshade_3pm</th>\n",
       "      <th>Horizontal_Distance_To_Fire_Points</th>\n",
       "      <th>Wilderness_Area</th>\n",
       "      <th>Soil_Type</th>\n",
       "      <th>Cover_Type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2596</td>\n",
       "      <td>51</td>\n",
       "      <td>3</td>\n",
       "      <td>258</td>\n",
       "      <td>0</td>\n",
       "      <td>510</td>\n",
       "      <td>221</td>\n",
       "      <td>232</td>\n",
       "      <td>148</td>\n",
       "      <td>6279</td>\n",
       "      <td>Rawah</td>\n",
       "      <td>C7745</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2590</td>\n",
       "      <td>56</td>\n",
       "      <td>2</td>\n",
       "      <td>212</td>\n",
       "      <td>-6</td>\n",
       "      <td>390</td>\n",
       "      <td>220</td>\n",
       "      <td>235</td>\n",
       "      <td>151</td>\n",
       "      <td>6225</td>\n",
       "      <td>Rawah</td>\n",
       "      <td>C7745</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2804</td>\n",
       "      <td>139</td>\n",
       "      <td>9</td>\n",
       "      <td>268</td>\n",
       "      <td>65</td>\n",
       "      <td>3180</td>\n",
       "      <td>234</td>\n",
       "      <td>238</td>\n",
       "      <td>135</td>\n",
       "      <td>6121</td>\n",
       "      <td>Rawah</td>\n",
       "      <td>C4744</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2785</td>\n",
       "      <td>155</td>\n",
       "      <td>18</td>\n",
       "      <td>242</td>\n",
       "      <td>118</td>\n",
       "      <td>3090</td>\n",
       "      <td>238</td>\n",
       "      <td>238</td>\n",
       "      <td>122</td>\n",
       "      <td>6211</td>\n",
       "      <td>Rawah</td>\n",
       "      <td>C7746</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2595</td>\n",
       "      <td>45</td>\n",
       "      <td>2</td>\n",
       "      <td>153</td>\n",
       "      <td>-1</td>\n",
       "      <td>391</td>\n",
       "      <td>220</td>\n",
       "      <td>234</td>\n",
       "      <td>150</td>\n",
       "      <td>6172</td>\n",
       "      <td>Rawah</td>\n",
       "      <td>C7745</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>581007</th>\n",
       "      <td>2396</td>\n",
       "      <td>153</td>\n",
       "      <td>20</td>\n",
       "      <td>85</td>\n",
       "      <td>17</td>\n",
       "      <td>108</td>\n",
       "      <td>240</td>\n",
       "      <td>237</td>\n",
       "      <td>118</td>\n",
       "      <td>837</td>\n",
       "      <td>Commanche</td>\n",
       "      <td>C2703</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>581008</th>\n",
       "      <td>2391</td>\n",
       "      <td>152</td>\n",
       "      <td>19</td>\n",
       "      <td>67</td>\n",
       "      <td>12</td>\n",
       "      <td>95</td>\n",
       "      <td>240</td>\n",
       "      <td>237</td>\n",
       "      <td>119</td>\n",
       "      <td>845</td>\n",
       "      <td>Commanche</td>\n",
       "      <td>C2703</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>581009</th>\n",
       "      <td>2386</td>\n",
       "      <td>159</td>\n",
       "      <td>17</td>\n",
       "      <td>60</td>\n",
       "      <td>7</td>\n",
       "      <td>90</td>\n",
       "      <td>236</td>\n",
       "      <td>241</td>\n",
       "      <td>130</td>\n",
       "      <td>854</td>\n",
       "      <td>Commanche</td>\n",
       "      <td>C2703</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>581010</th>\n",
       "      <td>2384</td>\n",
       "      <td>170</td>\n",
       "      <td>15</td>\n",
       "      <td>60</td>\n",
       "      <td>5</td>\n",
       "      <td>90</td>\n",
       "      <td>230</td>\n",
       "      <td>245</td>\n",
       "      <td>143</td>\n",
       "      <td>864</td>\n",
       "      <td>Commanche</td>\n",
       "      <td>C2703</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>581011</th>\n",
       "      <td>2383</td>\n",
       "      <td>165</td>\n",
       "      <td>13</td>\n",
       "      <td>60</td>\n",
       "      <td>4</td>\n",
       "      <td>67</td>\n",
       "      <td>231</td>\n",
       "      <td>244</td>\n",
       "      <td>141</td>\n",
       "      <td>875</td>\n",
       "      <td>Commanche</td>\n",
       "      <td>C2703</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>581012 rows × 13 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        Elevation  Aspect  Slope  Horizontal_Distance_To_Hydrology  \\\n",
       "0            2596      51      3                               258   \n",
       "1            2590      56      2                               212   \n",
       "2            2804     139      9                               268   \n",
       "3            2785     155     18                               242   \n",
       "4            2595      45      2                               153   \n",
       "...           ...     ...    ...                               ...   \n",
       "581007       2396     153     20                                85   \n",
       "581008       2391     152     19                                67   \n",
       "581009       2386     159     17                                60   \n",
       "581010       2384     170     15                                60   \n",
       "581011       2383     165     13                                60   \n",
       "\n",
       "        Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \\\n",
       "0                                    0                              510   \n",
       "1                                   -6                              390   \n",
       "2                                   65                             3180   \n",
       "3                                  118                             3090   \n",
       "4                                   -1                              391   \n",
       "...                                ...                              ...   \n",
       "581007                              17                              108   \n",
       "581008                              12                               95   \n",
       "581009                               7                               90   \n",
       "581010                               5                               90   \n",
       "581011                               4                               67   \n",
       "\n",
       "        Hillshade_9am  Hillshade_Noon  Hillshade_3pm  \\\n",
       "0                 221             232            148   \n",
       "1                 220             235            151   \n",
       "2                 234             238            135   \n",
       "3                 238             238            122   \n",
       "4                 220             234            150   \n",
       "...               ...             ...            ...   \n",
       "581007            240             237            118   \n",
       "581008            240             237            119   \n",
       "581009            236             241            130   \n",
       "581010            230             245            143   \n",
       "581011            231             244            141   \n",
       "\n",
       "        Horizontal_Distance_To_Fire_Points Wilderness_Area Soil_Type  \\\n",
       "0                                     6279           Rawah     C7745   \n",
       "1                                     6225           Rawah     C7745   \n",
       "2                                     6121           Rawah     C4744   \n",
       "3                                     6211           Rawah     C7746   \n",
       "4                                     6172           Rawah     C7745   \n",
       "...                                    ...             ...       ...   \n",
       "581007                                 837       Commanche     C2703   \n",
       "581008                                 845       Commanche     C2703   \n",
       "581009                                 854       Commanche     C2703   \n",
       "581010                                 864       Commanche     C2703   \n",
       "581011                                 875       Commanche     C2703   \n",
       "\n",
       "        Cover_Type  \n",
       "0                4  \n",
       "1                4  \n",
       "2                1  \n",
       "3                1  \n",
       "4                4  \n",
       "...            ...  \n",
       "581007           2  \n",
       "581008           2  \n",
       "581009           2  \n",
       "581010           2  \n",
       "581011           2  \n",
       "\n",
       "[581012 rows x 13 columns]"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_full = df = pd.read_csv(FULL_DATASET, dtype={'Soil_Type': object})\n",
    "df_full"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "C7745    115247\n",
       "C7202     57752\n",
       "C7756     52519\n",
       "C7757     45154\n",
       "C7201     33373\n",
       "C4703     32634\n",
       "C7746     30170\n",
       "C4744     29971\n",
       "C7755     25666\n",
       "C7700     21278\n",
       "C4758     17431\n",
       "C8771     15573\n",
       "C8772     13806\n",
       "C4704     12410\n",
       "C2705     12396\n",
       "C7102      9259\n",
       "C8776      8750\n",
       "C2703      7525\n",
       "C2717      6575\n",
       "C2704      4823\n",
       "C7101      4021\n",
       "C6102      3422\n",
       "C2702      3031\n",
       "C6101      2845\n",
       "C7702      2589\n",
       "C6731      1899\n",
       "C8703      1891\n",
       "C7790      1611\n",
       "C2706      1597\n",
       "C4201      1147\n",
       "C7709      1086\n",
       "C7710       946\n",
       "C7103       838\n",
       "C5101       599\n",
       "C7701       474\n",
       "C8708       298\n",
       "C3502       179\n",
       "C8707       119\n",
       "C3501       105\n",
       "C5151         3\n",
       "Name: Soil_Type, dtype: int64"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_full.Soil_Type.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_5151 = df_full[df_full['Soil_Type']=='C5151']\n",
    "df_no_5151 = df_full[df_full['Soil_Type']!='C5151']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Elevation</th>\n",
       "      <th>Aspect</th>\n",
       "      <th>Slope</th>\n",
       "      <th>Horizontal_Distance_To_Hydrology</th>\n",
       "      <th>Vertical_Distance_To_Hydrology</th>\n",
       "      <th>Horizontal_Distance_To_Roadways</th>\n",
       "      <th>Hillshade_9am</th>\n",
       "      <th>Hillshade_Noon</th>\n",
       "      <th>Hillshade_3pm</th>\n",
       "      <th>Horizontal_Distance_To_Fire_Points</th>\n",
       "      <th>Wilderness_Area</th>\n",
       "      <th>Soil_Type</th>\n",
       "      <th>Cover_Type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>241543</th>\n",
       "      <td>2078</td>\n",
       "      <td>34</td>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>212</td>\n",
       "      <td>219</td>\n",
       "      <td>218</td>\n",
       "      <td>134</td>\n",
       "      <td>484</td>\n",
       "      <td>Cache</td>\n",
       "      <td>C5151</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>241544</th>\n",
       "      <td>2080</td>\n",
       "      <td>13</td>\n",
       "      <td>19</td>\n",
       "      <td>30</td>\n",
       "      <td>0</td>\n",
       "      <td>192</td>\n",
       "      <td>198</td>\n",
       "      <td>197</td>\n",
       "      <td>132</td>\n",
       "      <td>499</td>\n",
       "      <td>Cache</td>\n",
       "      <td>C5151</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>241545</th>\n",
       "      <td>2076</td>\n",
       "      <td>27</td>\n",
       "      <td>24</td>\n",
       "      <td>30</td>\n",
       "      <td>5</td>\n",
       "      <td>175</td>\n",
       "      <td>201</td>\n",
       "      <td>180</td>\n",
       "      <td>105</td>\n",
       "      <td>516</td>\n",
       "      <td>Cache</td>\n",
       "      <td>C5151</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        Elevation  Aspect  Slope  Horizontal_Distance_To_Hydrology  \\\n",
       "241543       2078      34     10                                 0   \n",
       "241544       2080      13     19                                30   \n",
       "241545       2076      27     24                                30   \n",
       "\n",
       "        Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \\\n",
       "241543                               0                              212   \n",
       "241544                               0                              192   \n",
       "241545                               5                              175   \n",
       "\n",
       "        Hillshade_9am  Hillshade_Noon  Hillshade_3pm  \\\n",
       "241543            219             218            134   \n",
       "241544            198             197            132   \n",
       "241545            201             180            105   \n",
       "\n",
       "        Horizontal_Distance_To_Fire_Points Wilderness_Area Soil_Type  \\\n",
       "241543                                 484           Cache     C5151   \n",
       "241544                                 499           Cache     C5151   \n",
       "241545                                 516           Cache     C5151   \n",
       "\n",
       "        Cover_Type  \n",
       "241543           5  \n",
       "241544           5  \n",
       "241545           5  "
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_5151"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Elevation</th>\n",
       "      <th>Aspect</th>\n",
       "      <th>Slope</th>\n",
       "      <th>Horizontal_Distance_To_Hydrology</th>\n",
       "      <th>Vertical_Distance_To_Hydrology</th>\n",
       "      <th>Horizontal_Distance_To_Roadways</th>\n",
       "      <th>Hillshade_9am</th>\n",
       "      <th>Hillshade_Noon</th>\n",
       "      <th>Hillshade_3pm</th>\n",
       "      <th>Horizontal_Distance_To_Fire_Points</th>\n",
       "      <th>Wilderness_Area</th>\n",
       "      <th>Soil_Type</th>\n",
       "      <th>Cover_Type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2596</td>\n",
       "      <td>51</td>\n",
       "      <td>3</td>\n",
       "      <td>258</td>\n",
       "      <td>0</td>\n",
       "      <td>510</td>\n",
       "      <td>221</td>\n",
       "      <td>232</td>\n",
       "      <td>148</td>\n",
       "      <td>6279</td>\n",
       "      <td>Rawah</td>\n",
       "      <td>C7745</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2590</td>\n",
       "      <td>56</td>\n",
       "      <td>2</td>\n",
       "      <td>212</td>\n",
       "      <td>-6</td>\n",
       "      <td>390</td>\n",
       "      <td>220</td>\n",
       "      <td>235</td>\n",
       "      <td>151</td>\n",
       "      <td>6225</td>\n",
       "      <td>Rawah</td>\n",
       "      <td>C7745</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2804</td>\n",
       "      <td>139</td>\n",
       "      <td>9</td>\n",
       "      <td>268</td>\n",
       "      <td>65</td>\n",
       "      <td>3180</td>\n",
       "      <td>234</td>\n",
       "      <td>238</td>\n",
       "      <td>135</td>\n",
       "      <td>6121</td>\n",
       "      <td>Rawah</td>\n",
       "      <td>C4744</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2785</td>\n",
       "      <td>155</td>\n",
       "      <td>18</td>\n",
       "      <td>242</td>\n",
       "      <td>118</td>\n",
       "      <td>3090</td>\n",
       "      <td>238</td>\n",
       "      <td>238</td>\n",
       "      <td>122</td>\n",
       "      <td>6211</td>\n",
       "      <td>Rawah</td>\n",
       "      <td>C7746</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2595</td>\n",
       "      <td>45</td>\n",
       "      <td>2</td>\n",
       "      <td>153</td>\n",
       "      <td>-1</td>\n",
       "      <td>391</td>\n",
       "      <td>220</td>\n",
       "      <td>234</td>\n",
       "      <td>150</td>\n",
       "      <td>6172</td>\n",
       "      <td>Rawah</td>\n",
       "      <td>C7745</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>581007</th>\n",
       "      <td>2396</td>\n",
       "      <td>153</td>\n",
       "      <td>20</td>\n",
       "      <td>85</td>\n",
       "      <td>17</td>\n",
       "      <td>108</td>\n",
       "      <td>240</td>\n",
       "      <td>237</td>\n",
       "      <td>118</td>\n",
       "      <td>837</td>\n",
       "      <td>Commanche</td>\n",
       "      <td>C2703</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>581008</th>\n",
       "      <td>2391</td>\n",
       "      <td>152</td>\n",
       "      <td>19</td>\n",
       "      <td>67</td>\n",
       "      <td>12</td>\n",
       "      <td>95</td>\n",
       "      <td>240</td>\n",
       "      <td>237</td>\n",
       "      <td>119</td>\n",
       "      <td>845</td>\n",
       "      <td>Commanche</td>\n",
       "      <td>C2703</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>581009</th>\n",
       "      <td>2386</td>\n",
       "      <td>159</td>\n",
       "      <td>17</td>\n",
       "      <td>60</td>\n",
       "      <td>7</td>\n",
       "      <td>90</td>\n",
       "      <td>236</td>\n",
       "      <td>241</td>\n",
       "      <td>130</td>\n",
       "      <td>854</td>\n",
       "      <td>Commanche</td>\n",
       "      <td>C2703</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>581010</th>\n",
       "      <td>2384</td>\n",
       "      <td>170</td>\n",
       "      <td>15</td>\n",
       "      <td>60</td>\n",
       "      <td>5</td>\n",
       "      <td>90</td>\n",
       "      <td>230</td>\n",
       "      <td>245</td>\n",
       "      <td>143</td>\n",
       "      <td>864</td>\n",
       "      <td>Commanche</td>\n",
       "      <td>C2703</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>581011</th>\n",
       "      <td>2383</td>\n",
       "      <td>165</td>\n",
       "      <td>13</td>\n",
       "      <td>60</td>\n",
       "      <td>4</td>\n",
       "      <td>67</td>\n",
       "      <td>231</td>\n",
       "      <td>244</td>\n",
       "      <td>141</td>\n",
       "      <td>875</td>\n",
       "      <td>Commanche</td>\n",
       "      <td>C2703</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>581009 rows × 13 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        Elevation  Aspect  Slope  Horizontal_Distance_To_Hydrology  \\\n",
       "0            2596      51      3                               258   \n",
       "1            2590      56      2                               212   \n",
       "2            2804     139      9                               268   \n",
       "3            2785     155     18                               242   \n",
       "4            2595      45      2                               153   \n",
       "...           ...     ...    ...                               ...   \n",
       "581007       2396     153     20                                85   \n",
       "581008       2391     152     19                                67   \n",
       "581009       2386     159     17                                60   \n",
       "581010       2384     170     15                                60   \n",
       "581011       2383     165     13                                60   \n",
       "\n",
       "        Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \\\n",
       "0                                    0                              510   \n",
       "1                                   -6                              390   \n",
       "2                                   65                             3180   \n",
       "3                                  118                             3090   \n",
       "4                                   -1                              391   \n",
       "...                                ...                              ...   \n",
       "581007                              17                              108   \n",
       "581008                              12                               95   \n",
       "581009                               7                               90   \n",
       "581010                               5                               90   \n",
       "581011                               4                               67   \n",
       "\n",
       "        Hillshade_9am  Hillshade_Noon  Hillshade_3pm  \\\n",
       "0                 221             232            148   \n",
       "1                 220             235            151   \n",
       "2                 234             238            135   \n",
       "3                 238             238            122   \n",
       "4                 220             234            150   \n",
       "...               ...             ...            ...   \n",
       "581007            240             237            118   \n",
       "581008            240             237            119   \n",
       "581009            236             241            130   \n",
       "581010            230             245            143   \n",
       "581011            231             244            141   \n",
       "\n",
       "        Horizontal_Distance_To_Fire_Points Wilderness_Area Soil_Type  \\\n",
       "0                                     6279           Rawah     C7745   \n",
       "1                                     6225           Rawah     C7745   \n",
       "2                                     6121           Rawah     C4744   \n",
       "3                                     6211           Rawah     C7746   \n",
       "4                                     6172           Rawah     C7745   \n",
       "...                                    ...             ...       ...   \n",
       "581007                                 837       Commanche     C2703   \n",
       "581008                                 845       Commanche     C2703   \n",
       "581009                                 854       Commanche     C2703   \n",
       "581010                                 864       Commanche     C2703   \n",
       "581011                                 875       Commanche     C2703   \n",
       "\n",
       "        Cover_Type  \n",
       "0                4  \n",
       "1                4  \n",
       "2                1  \n",
       "3                1  \n",
       "4                4  \n",
       "...            ...  \n",
       "581007           2  \n",
       "581008           2  \n",
       "581009           2  \n",
       "581010           2  \n",
       "581011           2  \n",
       "\n",
       "[581009 rows x 13 columns]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_no_5151"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_small, df_other = train_test_split(df_no_5151, train_size=100000, stratify=df_no_5151.Cover_Type)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(431009, 13)\n",
      "(75000, 13)\n",
      "(75000, 12)\n"
     ]
    }
   ],
   "source": [
    "df_train, df_other = train_test_split(df_no_5151, train_size=431009, stratify=df_no_5151.Cover_Type)\n",
    "df_evaluate, df_serving = train_test_split(df_other, train_size=75000, stratify=df_other.Cover_Type)\n",
    "df_serving = df_serving.drop(columns=['Cover_Type'])\n",
    "print(df_train.shape)\n",
    "print(df_evaluate.shape)\n",
    "print(df_serving.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Add some missing values to the training split."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Elevation</th>\n",
       "      <th>Aspect</th>\n",
       "      <th>Slope</th>\n",
       "      <th>Horizontal_Distance_To_Hydrology</th>\n",
       "      <th>Vertical_Distance_To_Hydrology</th>\n",
       "      <th>Horizontal_Distance_To_Roadways</th>\n",
       "      <th>Hillshade_9am</th>\n",
       "      <th>Hillshade_Noon</th>\n",
       "      <th>Hillshade_3pm</th>\n",
       "      <th>Horizontal_Distance_To_Fire_Points</th>\n",
       "      <th>Wilderness_Area</th>\n",
       "      <th>Soil_Type</th>\n",
       "      <th>Cover_Type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3212</td>\n",
       "      <td>191</td>\n",
       "      <td>10</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15</td>\n",
       "      <td>5455</td>\n",
       "      <td>220</td>\n",
       "      <td>248</td>\n",
       "      <td>161</td>\n",
       "      <td>1126</td>\n",
       "      <td>Rawah</td>\n",
       "      <td>C7745</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3205</td>\n",
       "      <td>3</td>\n",
       "      <td>14</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16</td>\n",
       "      <td>4230</td>\n",
       "      <td>200</td>\n",
       "      <td>213</td>\n",
       "      <td>149</td>\n",
       "      <td>2065</td>\n",
       "      <td>Commanche</td>\n",
       "      <td>C7201</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2605</td>\n",
       "      <td>74</td>\n",
       "      <td>16</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-14</td>\n",
       "      <td>1195</td>\n",
       "      <td>237</td>\n",
       "      <td>208</td>\n",
       "      <td>98</td>\n",
       "      <td>295</td>\n",
       "      <td>Rawah</td>\n",
       "      <td>C7201</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2768</td>\n",
       "      <td>73</td>\n",
       "      <td>31</td>\n",
       "      <td>NaN</td>\n",
       "      <td>42</td>\n",
       "      <td>268</td>\n",
       "      <td>238</td>\n",
       "      <td>164</td>\n",
       "      <td>34</td>\n",
       "      <td>2049</td>\n",
       "      <td>Rawah</td>\n",
       "      <td>C7746</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3230</td>\n",
       "      <td>45</td>\n",
       "      <td>13</td>\n",
       "      <td>NaN</td>\n",
       "      <td>20</td>\n",
       "      <td>2809</td>\n",
       "      <td>223</td>\n",
       "      <td>211</td>\n",
       "      <td>120</td>\n",
       "      <td>1075</td>\n",
       "      <td>Commanche</td>\n",
       "      <td>C7201</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>431004</th>\n",
       "      <td>2950</td>\n",
       "      <td>36</td>\n",
       "      <td>4</td>\n",
       "      <td>108.0</td>\n",
       "      <td>1</td>\n",
       "      <td>2037</td>\n",
       "      <td>219</td>\n",
       "      <td>230</td>\n",
       "      <td>148</td>\n",
       "      <td>306</td>\n",
       "      <td>Commanche</td>\n",
       "      <td>C7756</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>431005</th>\n",
       "      <td>2837</td>\n",
       "      <td>278</td>\n",
       "      <td>10</td>\n",
       "      <td>30.0</td>\n",
       "      <td>6</td>\n",
       "      <td>604</td>\n",
       "      <td>193</td>\n",
       "      <td>242</td>\n",
       "      <td>189</td>\n",
       "      <td>1664</td>\n",
       "      <td>Commanche</td>\n",
       "      <td>C4758</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>431006</th>\n",
       "      <td>3101</td>\n",
       "      <td>152</td>\n",
       "      <td>9</td>\n",
       "      <td>150.0</td>\n",
       "      <td>-1</td>\n",
       "      <td>1018</td>\n",
       "      <td>232</td>\n",
       "      <td>240</td>\n",
       "      <td>139</td>\n",
       "      <td>1655</td>\n",
       "      <td>Rawah</td>\n",
       "      <td>C7201</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>431007</th>\n",
       "      <td>3228</td>\n",
       "      <td>136</td>\n",
       "      <td>14</td>\n",
       "      <td>216.0</td>\n",
       "      <td>41</td>\n",
       "      <td>2797</td>\n",
       "      <td>241</td>\n",
       "      <td>234</td>\n",
       "      <td>119</td>\n",
       "      <td>997</td>\n",
       "      <td>Commanche</td>\n",
       "      <td>C7202</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>431008</th>\n",
       "      <td>3060</td>\n",
       "      <td>358</td>\n",
       "      <td>16</td>\n",
       "      <td>495.0</td>\n",
       "      <td>16</td>\n",
       "      <td>3619</td>\n",
       "      <td>193</td>\n",
       "      <td>209</td>\n",
       "      <td>152</td>\n",
       "      <td>2773</td>\n",
       "      <td>Rawah</td>\n",
       "      <td>C7745</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>431009 rows × 13 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        Elevation  Aspect  Slope  Horizontal_Distance_To_Hydrology  \\\n",
       "0            3212     191     10                               NaN   \n",
       "1            3205       3     14                               NaN   \n",
       "2            2605      74     16                               NaN   \n",
       "3            2768      73     31                               NaN   \n",
       "4            3230      45     13                               NaN   \n",
       "...           ...     ...    ...                               ...   \n",
       "431004       2950      36      4                             108.0   \n",
       "431005       2837     278     10                              30.0   \n",
       "431006       3101     152      9                             150.0   \n",
       "431007       3228     136     14                             216.0   \n",
       "431008       3060     358     16                             495.0   \n",
       "\n",
       "        Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \\\n",
       "0                                   15                             5455   \n",
       "1                                   16                             4230   \n",
       "2                                  -14                             1195   \n",
       "3                                   42                              268   \n",
       "4                                   20                             2809   \n",
       "...                                ...                              ...   \n",
       "431004                               1                             2037   \n",
       "431005                               6                              604   \n",
       "431006                              -1                             1018   \n",
       "431007                              41                             2797   \n",
       "431008                              16                             3619   \n",
       "\n",
       "        Hillshade_9am  Hillshade_Noon  Hillshade_3pm  \\\n",
       "0                 220             248            161   \n",
       "1                 200             213            149   \n",
       "2                 237             208             98   \n",
       "3                 238             164             34   \n",
       "4                 223             211            120   \n",
       "...               ...             ...            ...   \n",
       "431004            219             230            148   \n",
       "431005            193             242            189   \n",
       "431006            232             240            139   \n",
       "431007            241             234            119   \n",
       "431008            193             209            152   \n",
       "\n",
       "        Horizontal_Distance_To_Fire_Points Wilderness_Area Soil_Type  \\\n",
       "0                                     1126           Rawah     C7745   \n",
       "1                                     2065       Commanche     C7201   \n",
       "2                                      295           Rawah     C7201   \n",
       "3                                     2049           Rawah     C7746   \n",
       "4                                     1075       Commanche     C7201   \n",
       "...                                    ...             ...       ...   \n",
       "431004                                 306       Commanche     C7756   \n",
       "431005                                1664       Commanche     C4758   \n",
       "431006                                1655           Rawah     C7201   \n",
       "431007                                 997       Commanche     C7202   \n",
       "431008                                2773           Rawah     C7745   \n",
       "\n",
       "        Cover_Type  \n",
       "0                0  \n",
       "1                0  \n",
       "2                1  \n",
       "3                4  \n",
       "4                0  \n",
       "...            ...  \n",
       "431004           1  \n",
       "431005           1  \n",
       "431006           0  \n",
       "431007           0  \n",
       "431008           1  \n",
       "\n",
       "[431009 rows x 13 columns]"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train_missing = df_train.reset_index(drop=True)\n",
    "df_train_missing.loc[0:8999, 'Horizontal_Distance_To_Hydrology'] = None\n",
    "df_train_missing"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create the evaluation split where some values of Slope are more than 90 degrees and 3 examples have 5151 code for soil type, which is not present in the training split."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Elevation</th>\n",
       "      <th>Aspect</th>\n",
       "      <th>Slope</th>\n",
       "      <th>Horizontal_Distance_To_Hydrology</th>\n",
       "      <th>Vertical_Distance_To_Hydrology</th>\n",
       "      <th>Horizontal_Distance_To_Roadways</th>\n",
       "      <th>Hillshade_9am</th>\n",
       "      <th>Hillshade_Noon</th>\n",
       "      <th>Hillshade_3pm</th>\n",
       "      <th>Horizontal_Distance_To_Fire_Points</th>\n",
       "      <th>Wilderness_Area</th>\n",
       "      <th>Soil_Type</th>\n",
       "      <th>Cover_Type</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>3001</td>\n",
       "      <td>96</td>\n",
       "      <td>110</td>\n",
       "      <td>534</td>\n",
       "      <td>16</td>\n",
       "      <td>5234</td>\n",
       "      <td>231</td>\n",
       "      <td>231</td>\n",
       "      <td>133</td>\n",
       "      <td>5454</td>\n",
       "      <td>Rawah</td>\n",
       "      <td>C4744</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>3005</td>\n",
       "      <td>139</td>\n",
       "      <td>110</td>\n",
       "      <td>175</td>\n",
       "      <td>33</td>\n",
       "      <td>2405</td>\n",
       "      <td>236</td>\n",
       "      <td>237</td>\n",
       "      <td>131</td>\n",
       "      <td>612</td>\n",
       "      <td>Rawah</td>\n",
       "      <td>C7745</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2768</td>\n",
       "      <td>91</td>\n",
       "      <td>110</td>\n",
       "      <td>242</td>\n",
       "      <td>119</td>\n",
       "      <td>256</td>\n",
       "      <td>248</td>\n",
       "      <td>193</td>\n",
       "      <td>59</td>\n",
       "      <td>1890</td>\n",
       "      <td>Rawah</td>\n",
       "      <td>C7746</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3153</td>\n",
       "      <td>346</td>\n",
       "      <td>110</td>\n",
       "      <td>277</td>\n",
       "      <td>-32</td>\n",
       "      <td>1328</td>\n",
       "      <td>203</td>\n",
       "      <td>227</td>\n",
       "      <td>162</td>\n",
       "      <td>902</td>\n",
       "      <td>Neota</td>\n",
       "      <td>C7700</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>3379</td>\n",
       "      <td>68</td>\n",
       "      <td>110</td>\n",
       "      <td>150</td>\n",
       "      <td>-12</td>\n",
       "      <td>3609</td>\n",
       "      <td>236</td>\n",
       "      <td>192</td>\n",
       "      <td>77</td>\n",
       "      <td>2658</td>\n",
       "      <td>Rawah</td>\n",
       "      <td>C8708</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>74998</th>\n",
       "      <td>2958</td>\n",
       "      <td>61</td>\n",
       "      <td>24</td>\n",
       "      <td>234</td>\n",
       "      <td>99</td>\n",
       "      <td>5432</td>\n",
       "      <td>231</td>\n",
       "      <td>182</td>\n",
       "      <td>69</td>\n",
       "      <td>834</td>\n",
       "      <td>Rawah</td>\n",
       "      <td>C7746</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>74999</th>\n",
       "      <td>3159</td>\n",
       "      <td>132</td>\n",
       "      <td>14</td>\n",
       "      <td>150</td>\n",
       "      <td>17</td>\n",
       "      <td>3353</td>\n",
       "      <td>241</td>\n",
       "      <td>232</td>\n",
       "      <td>118</td>\n",
       "      <td>633</td>\n",
       "      <td>Commanche</td>\n",
       "      <td>C7201</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>241543</th>\n",
       "      <td>2078</td>\n",
       "      <td>34</td>\n",
       "      <td>10</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>212</td>\n",
       "      <td>219</td>\n",
       "      <td>218</td>\n",
       "      <td>134</td>\n",
       "      <td>484</td>\n",
       "      <td>Cache</td>\n",
       "      <td>C5151</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>241544</th>\n",
       "      <td>2080</td>\n",
       "      <td>13</td>\n",
       "      <td>19</td>\n",
       "      <td>30</td>\n",
       "      <td>0</td>\n",
       "      <td>192</td>\n",
       "      <td>198</td>\n",
       "      <td>197</td>\n",
       "      <td>132</td>\n",
       "      <td>499</td>\n",
       "      <td>Cache</td>\n",
       "      <td>C5151</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>241545</th>\n",
       "      <td>2076</td>\n",
       "      <td>27</td>\n",
       "      <td>24</td>\n",
       "      <td>30</td>\n",
       "      <td>5</td>\n",
       "      <td>175</td>\n",
       "      <td>201</td>\n",
       "      <td>180</td>\n",
       "      <td>105</td>\n",
       "      <td>516</td>\n",
       "      <td>Cache</td>\n",
       "      <td>C5151</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>75003 rows × 13 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        Elevation  Aspect  Slope  Horizontal_Distance_To_Hydrology  \\\n",
       "0            3001      96    110                               534   \n",
       "1            3005     139    110                               175   \n",
       "2            2768      91    110                               242   \n",
       "3            3153     346    110                               277   \n",
       "4            3379      68    110                               150   \n",
       "...           ...     ...    ...                               ...   \n",
       "74998        2958      61     24                               234   \n",
       "74999        3159     132     14                               150   \n",
       "241543       2078      34     10                                 0   \n",
       "241544       2080      13     19                                30   \n",
       "241545       2076      27     24                                30   \n",
       "\n",
       "        Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \\\n",
       "0                                   16                             5234   \n",
       "1                                   33                             2405   \n",
       "2                                  119                              256   \n",
       "3                                  -32                             1328   \n",
       "4                                  -12                             3609   \n",
       "...                                ...                              ...   \n",
       "74998                               99                             5432   \n",
       "74999                               17                             3353   \n",
       "241543                               0                              212   \n",
       "241544                               0                              192   \n",
       "241545                               5                              175   \n",
       "\n",
       "        Hillshade_9am  Hillshade_Noon  Hillshade_3pm  \\\n",
       "0                 231             231            133   \n",
       "1                 236             237            131   \n",
       "2                 248             193             59   \n",
       "3                 203             227            162   \n",
       "4                 236             192             77   \n",
       "...               ...             ...            ...   \n",
       "74998             231             182             69   \n",
       "74999             241             232            118   \n",
       "241543            219             218            134   \n",
       "241544            198             197            132   \n",
       "241545            201             180            105   \n",
       "\n",
       "        Horizontal_Distance_To_Fire_Points Wilderness_Area Soil_Type  \\\n",
       "0                                     5454           Rawah     C4744   \n",
       "1                                      612           Rawah     C7745   \n",
       "2                                     1890           Rawah     C7746   \n",
       "3                                      902           Neota     C7700   \n",
       "4                                     2658           Rawah     C8708   \n",
       "...                                    ...             ...       ...   \n",
       "74998                                  834           Rawah     C7746   \n",
       "74999                                  633       Commanche     C7201   \n",
       "241543                                 484           Cache     C5151   \n",
       "241544                                 499           Cache     C5151   \n",
       "241545                                 516           Cache     C5151   \n",
       "\n",
       "        Cover_Type  \n",
       "0                1  \n",
       "1                0  \n",
       "2                4  \n",
       "3                1  \n",
       "4                6  \n",
       "...            ...  \n",
       "74998            1  \n",
       "74999            1  \n",
       "241543           5  \n",
       "241544           5  \n",
       "241545           5  \n",
       "\n",
       "[75003 rows x 13 columns]"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_evaluate_anomalies = df_evaluate.reset_index(drop=True)\n",
    "df_evaluate_anomalies.loc[0:4, 'Slope'] = 110\n",
    "df_evaluate_anomalies = pd.concat([df_evaluate_anomalies, df_5151])\n",
    "df_evaluate_anomalies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "C7745    14939\n",
       "C7202     7512\n",
       "C7756     6731\n",
       "C7757     5740\n",
       "C7201     4249\n",
       "C4703     4167\n",
       "C4744     3905\n",
       "C7746     3851\n",
       "C7755     3408\n",
       "C7700     2838\n",
       "C4758     2231\n",
       "C8771     1992\n",
       "C8772     1841\n",
       "C4704     1598\n",
       "C2705     1592\n",
       "C7102     1091\n",
       "C8776     1078\n",
       "C2703      968\n",
       "C2717      871\n",
       "C2704      636\n",
       "C7101      489\n",
       "C6102      452\n",
       "C2702      420\n",
       "C6101      383\n",
       "C7702      316\n",
       "C8703      257\n",
       "C6731      242\n",
       "C7790      214\n",
       "C2706      211\n",
       "C4201      147\n",
       "C7709      145\n",
       "C7710      135\n",
       "C7103      109\n",
       "C5101       74\n",
       "C7701       63\n",
       "C8708       52\n",
       "C3502       21\n",
       "C3501       16\n",
       "C8707       16\n",
       "C5151        3\n",
       "Name: Soil_Type, dtype: int64"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_evaluate_anomalies.Soil_Type.value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Save the splits to local files."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train.to_csv(TRAINING_DATASET, header=True, index=False)\n",
    "df_small.to_csv(SMALL_DATASET, header=True, index=False)\n",
    "df_train_missing.to_csv(TRAINING_DATASET_WITH_MISSING, header=True, index=False)\n",
    "df_evaluate.to_csv(EVALUATION_DATASET, header=True, index=False)\n",
    "df_evaluate_anomalies.to_csv(EVALUATION_DATASET_WITH_ANOMALIES, header=True, index=False)\n",
    "df_serving.to_csv(SERVING_DATASET, header=True, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Copy the splits to GCS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Copying file://../covertype.csv [Content-Type=text/csv]...\n",
      "- [1 files][ 30.5 MiB/ 30.5 MiB]                                                \n",
      "Operation completed over 1 objects/30.5 MiB.                                     \n",
      "Copying file://../covertype_small.csv [Content-Type=text/csv]...\n",
      "/ [1 files][  5.3 MiB/  5.3 MiB]                                                \n",
      "Operation completed over 1 objects/5.3 MiB.                                      \n",
      "Copying file://../covertype_training.csv [Content-Type=text/csv]...\n",
      "- [1 files][ 22.7 MiB/ 22.7 MiB]                                                \n",
      "Operation completed over 1 objects/22.7 MiB.                                     \n",
      "Copying file://../covertype_training_missing.csv [Content-Type=text/csv]...\n",
      "- [1 files][ 23.4 MiB/ 23.4 MiB]                                                \n",
      "Operation completed over 1 objects/23.4 MiB.                                     \n",
      "Copying file://../covertype_evaluation.csv [Content-Type=text/csv]...\n",
      "/ [1 files][  3.9 MiB/  3.9 MiB]                                                \n",
      "Operation completed over 1 objects/3.9 MiB.                                      \n",
      "Copying file://../covertype_evaluation_anomalies.csv [Content-Type=text/csv]...\n",
      "/ [1 files][  3.9 MiB/  3.9 MiB]                                                \n",
      "Operation completed over 1 objects/3.9 MiB.                                      \n",
      "Copying file://../covertype_serving.csv [Content-Type=text/csv]...\n",
      "/ [1 files][  3.8 MiB/  3.8 MiB]                                                \n",
      "Operation completed over 1 objects/3.8 MiB.                                      \n"
     ]
    }
   ],
   "source": [
    "!gsutil cp $FULL_DATASET gs://workshop-datasets/covertype/full/dataset.csv\n",
    "!gsutil cp $SMALL_DATASET gs://workshop-datasets/covertype/small/dataset.csv\n",
    "!gsutil cp $TRAINING_DATASET gs://workshop-datasets/covertype/training/dataset.csv\n",
    "!gsutil cp $TRAINING_DATASET_WITH_MISSING gs://workshop-datasets/covertype/training_missing/dataset.csv\n",
    "!gsutil cp $EVALUATION_DATASET gs://workshop-datasets/covertype/evaluation/dataset.csv\n",
    "!gsutil cp $EVALUATION_DATASET_WITH_ANOMALIES gs://workshop-datasets/covertype/evaluation_anomalies/dataset.csv\n",
    "!gsutil cp $SERVING_DATASET gs://workshop-datasets/covertype/serving/dataset.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
